\subsection{UPC Collective Utilities {\tt <upc\_collective.h>}}
\label{upc-collective}
\index{collective library}
\index{\_\_UPC\_COLLECTIVE\_\_}
\index{upc\_collective.h}

\npf Implementations that support this interface shall predefine the
    feature macro {\tt \_\_UPC\_COLLECTIVE\_\_} to the value 1.

\np The following requirements apply to all of the functions defined
in Section~\ref{upc-collective}.

\np All of the functions are collective.~%
\xadded[id=DB]{10}{%
  \truefootnote{Note that collective does not necessarily imply barrier synchronization.  
  The synchronization behavior of the library functions is explicitly controlled by 
  using the {\tt upc\_flag\_t flags} argument. See \upcflagsection{} for details.}
}

\np All collective function arguments are single-valued.

\np Collective functions may not be called between {\tt upc\_notify}
and the corresponding {\tt upc\_wait}.

\subsubsection{Standard headers}

\np The standard header is

{\tt <upc\_collective.h>}

\np \xadded[id=DB]{10}{
    Every inclusion of {\tt <upc\_collective.h>} has the effect of including {\tt <upc\_types.h>}.
    }

\subsubsection{Relocalization Operations}

\paragraph{The {\tt upc\_all\_broadcast} function}
\index{upc\_all\_broadcast}
\index{broadcast}

{\bf Synopsis} 

\npf\vspace{-2.5em}
\begin{verbatim}
    #include <upc_collective.h>
    void upc_all_broadcast(shared void * restrict dst, 
         shared const void * restrict src, size_t nbytes, 
         upc_flag_t flags);
\end{verbatim}

{\bf Description} 

\np The {\tt upc\_all\_broadcast} function copies a block of memory with
affinity to a single thread to a block of shared memory on each thread.
The number of bytes in each block is {\tt nbytes}.

\np{\tt nbytes} must be strictly greater than 0.

\np The {\tt upc\_all\_broadcast} function treats the {\tt src} pointer
as if it pointed to a shared memory area with the type:

\begin{verbatim}
    shared [] char[nbytes]
\end{verbatim}  

\np The effect is equivalent to copying the entire array pointed to by
{\tt src} to each block of {\tt nbytes} bytes of a shared
array {\tt dst} with the type:

\begin{verbatim}
    shared [nbytes] char[nbytes * THREADS]
\end{verbatim}  

\np The target of the {\tt dst} pointer must have affinity to
thread 0.

\np The {\tt dst} pointer is treated as if it has phase 0.

\np EXAMPLE 1 shows {\tt upc\_all\_broadcast}
\begin{verbatim}
  #include <upc_collective.h>
  shared int A[THREADS];
  shared int B[THREADS];
  // Initialize A.
  upc_barrier;
  upc_all_broadcast( B, &A[1], sizeof(int),
                     UPC_IN_NOSYNC | UPC_OUT_NOSYNC );
  upc_barrier;
\end{verbatim}

\np EXAMPLE 2:
\begin{verbatim}
  #include <upc_collective.h>
  #define NELEMS 10
  shared [] int A[NELEMS];
  shared [NELEMS] int B[NELEMS*THREADS];
  // Initialize A.
  upc_all_broadcast( B, A, sizeof(int)*NELEMS,
                     UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC );
\end{verbatim}

\np EXAMPLE 3 shows {\tt (A[3],A[4])} is broadcast to
{\tt (B[0],B[1])}, {\tt (B[10],B[11])}, \\
{\tt (B[20],B[21])}, ..., 
{\tt (B[NELEMS*(THREADS-1)],B[NELEMS*(THREADS-1)+1])}.
\begin{verbatim}
  #include <upc_collective.h>
  #define NELEMS 10
  shared [NELEMS] int A[NELEMS*THREADS];
  shared [NELEMS] int B[NELEMS*THREADS];
  // Initialize A.
  upc_barrier;
  upc_all_broadcast( B, &A[3], sizeof(int)*2,
                     UPC_IN_NOSYNC | UPC_OUT_NOSYNC );
  upc_barrier;
\end{verbatim}

\paragraph{The {\tt upc\_all\_scatter} function}

{\bf Synopsis} 
\index{upc\_all\_scatter}
\index{scatter}

\npf\vspace{-2.5em}
\begin{verbatim}
    #include <upc_collective.h>
    void upc_all_scatter(shared void * restrict dst, 
         shared const void * restrict src, size_t nbytes, 
         upc_flag_t flags);
\end{verbatim}

{\bf Description} 

\np The {\tt upc\_all\_scatter} function copies the $i$th block of an
area of shared memory with affinity to a single thread
to a block of shared memory with affinity to the $i$th thread.
The number of bytes in each block is {\tt nbytes}.

\np{\tt nbytes} must be strictly greater than 0.

\np The {\tt upc\_all\_scatter} function treats the {\tt src} pointer
as if it pointed to a shared memory area with the type:

\begin{verbatim}
    shared [] char[nbytes * THREADS]
\end{verbatim}  

\np and it treats the {\tt dst} pointer as if it pointed to a shared
memory area with the type:

\begin{verbatim}
    shared [nbytes] char[nbytes * THREADS]
\end{verbatim}  

\np The target of the {\tt dst} pointer must have affinity to thread 0.

\np The {\tt dst} pointer is treated as if it has phase 0.

\np For each thread $i$, the effect is equivalent to copying
the $i$th block of {\tt nbytes} bytes pointed to by {\tt src} to
the block of {\tt nbytes} bytes 
pointed to by {\tt dst} that has affinity to thread $i$.

\np EXAMPLE 1 {\tt upc\_all\_scatter} for the {\em dynamic THREADS} translation
environment.
%This example corresponds to Figure \ref{fig2}.

\begin{verbatim}
  #include <upc.h>
  #include <upc_collective.h>
  #define NUMELEMS 10
  #define SRC_THREAD 1
  shared int *A;
  shared [] int *myA, *srcA;
  int i;
  shared [NUMELEMS] int B[NUMELEMS*THREADS];

  // allocate and initialize an array distributed across all threads
  A = upc_all_alloc(THREADS, THREADS*NUMELEMS*sizeof(int));
  myA = (shared [] int *) &A[MYTHREAD];
  for (i=0; i<NUMELEMS*THREADS; i++)
      myA[i] = i + NUMELEMS*THREADS*MYTHREAD;   // (for example)
  // scatter the SRC_THREAD's row of the array
  srcA = (shared [] int *) &A[SRC_THREAD];
  upc_barrier;
  upc_all_scatter( B, srcA, sizeof(int)*NUMELEMS,
                   UPC_IN_NOSYNC | UPC_OUT_NOSYNC);
  upc_barrier;
\end{verbatim}

\np EXAMPLE 2 {\tt upc\_all\_scatter} for the {\em static THREADS} 
translation environment.

\begin{verbatim}
  #include <upc_collective.h>
  #define NELEMS 10
  shared [] int A[NELEMS*THREADS];
  shared [NELEMS] int B[NELEMS*THREADS];
  // Initialize A.
  upc_all_scatter( B, A, sizeof(int)*NELEMS,
                   UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC );
\end{verbatim}

\paragraph{The {\tt upc\_all\_gather} function}

{\bf Synopsis} 
\index{upc\_all\_gather}
\index{gather}

\npf\vspace{-2.5em} 

\begin{verbatim}
    #include <upc_collective.h>
    void upc_all_gather(shared void * restrict dst,
         shared const void * restrict src, size_t nbytes,
         upc_flag_t flags);
\end{verbatim}

{\bf Description} 

\np The {\tt upc\_all\_gather} function copies a block of shared memory
that has affinity to the $i$th thread to the $i$th block
of a shared memory area that has affinity to a single thread.
The number of bytes in each block is {\tt nbytes}.

\np{\tt nbytes} must be strictly greater than 0.

\np The {\tt upc\_all\_gather} function treats the {\tt src} pointer
as if it pointed to a shared memory area of {\tt nbytes} bytes on each
thread and therefore had type:

\begin{verbatim}
    shared [nbytes] char[nbytes * THREADS]
\end{verbatim}  

\np and it treats the {\tt dst} pointer as if it pointed to a shared
memory area with the type:

\begin{verbatim}
    shared [] char[nbytes * THREADS]
\end{verbatim} 

\np The target of the {\tt src} pointer must have affinity to thread 0.

\np The {\tt src} pointer is treated as if it has phase 0.

\np For each thread $i$, the effect is equivalent to copying
the block of {\tt nbytes} bytes
pointed to by {\tt src} that has affinity to thread $i$
to the $i$th block of {\tt nbytes} bytes pointed to by {\tt dst}.

\np EXAMPLE 1 {\tt upc\_all\_gather} for the {\em static THREADS}
translation environment.

\begin{verbatim}
  #include <upc_collective.h>
  #define NELEMS 10
  shared [NELEMS] int A[NELEMS*THREADS];
  shared [] int B[NELEMS*THREADS];
  // Initialize A.
  upc_all_gather( B, A, sizeof(int)*NELEMS,
                  UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC );
\end{verbatim}

\np EXAMPLE 2 {\tt upc\_all\_gather} for the {\em dynamic THREADS}
translation environment.

\begin{verbatim}
  #include <upc.h>
  #include <upc_collective.h>
  #define NELEMS 10
  shared [NELEMS] int A[NELEMS*THREADS];
  shared [] int *B;
  B = (shared [] int *) upc_all_alloc(1,NELEMS*THREADS*sizeof(int));
  // Initialize A.
  upc_barrier;
  upc_all_gather( B, A, sizeof(int)*NELEMS,
                  UPC_IN_NOSYNC | UPC_OUT_NOSYNC );
  upc_barrier;
\end{verbatim}

\paragraph{The {\tt upc\_all\_gather\_all} function}

{\bf Synopsis} 
\index{upc\_all\_gather\_all}
\index{gather, to all}

\npf\vspace{-2.5em} 
\begin{verbatim}
    #include <upc_collective.h>
    void upc_all_gather_all(shared void * restrict dst,
         shared const void * restrict src, size_t nbytes,
         upc_flag_t flags);
\end{verbatim}

{\bf Description} 

\np
The {\tt upc\_all\_gather\_all} function copies a block of memory from one
shared memory area with affinity to the $i$th thread to the $i$th block
of a shared memory area on each thread.
The number of bytes in each block is {\tt nbytes}.

\np{\tt nbytes} must be strictly greater than 0.

\np The {\tt upc\_all\_gather\_all} function treats the {\tt src} pointer
as if it pointed to a shared memory area of {\tt nbytes} bytes on each
thread and therefore had type:

\begin{verbatim}
    shared [nbytes] char[nbytes * THREADS]
\end{verbatim}  

\np and it treats the {\tt dst} pointer as if it pointed to a shared
memory area with the type:

\begin{verbatim}
    shared [nbytes * THREADS] char[nbytes * THREADS * THREADS]
\end{verbatim} 

\np The targets of the {\tt src} and {\tt dst} pointers
must have affinity to thread 0.

\np The {\tt src} and {\tt dst} pointers are treated as
if they have phase 0.

\np
The effect is equivalent to copying the
$i$th block of {\tt nbytes} bytes pointed to by {\tt src} to the
$i$th block of {\tt nbytes} bytes pointed to by {\tt dst} that
has affinity to each thread.

\np EXAMPLE 1 {\tt upc\_all\_gather\_all} for the {\em static THREADS}
translation environment.

\begin{verbatim}
  #include <upc_collective.h>
  #define NELEMS 10
  shared [NELEMS] int A[NELEMS*THREADS];
  shared [NELEMS*THREADS] int B[THREADS][NELEMS*THREADS];
  // Initialize A.
  upc_barrier;
  upc_all_gather_all( B, A, sizeof(int)*NELEMS,
                      UPC_IN_NOSYNC | UPC_OUT_NOSYNC );
  upc_barrier;
\end{verbatim}

\np EXAMPLE 2 {\tt upc\_all\_gather\_all} for the {\em dynamic THREADS}
translation environment.

\begin{verbatim}
  #include <upc.h>
  #include <upc_collective.h>
  #define NELEMS 10
  shared [NELEMS] int A[NELEMS*THREADS];
  shared int *Bdata;
  shared [] int *myB;

  Bdata = upc_all_alloc(THREADS*THREADS, NELEMS*sizeof(int));
  myB = (shared [] int *)&Bdata[MYTHREAD];

  // Bdata contains THREADS*THREADS*NELEMS elements.
  // myB is MYTHREAD's row of Bdata.
  // Initialize A.
  upc_all_gather_all( Bdata, A, NELEMS*sizeof(int),
                      UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC );
\end{verbatim}

\paragraph{The {\tt upc\_all\_exchange} function}
\index{upc\_all\_exchange}
\index{exchange}

{\bf Synopsis} 

\npf\vspace{-2.5em}
\begin{verbatim}
    #include <upc_collective.h>
    void upc_all_exchange(shared void * restrict dst, 
         shared const void * restrict src, size_t nbytes,
         upc_flag_t flags);
\end{verbatim}

{\bf Description} 

\np The {\tt upc\_all\_exchange} function copies the $i$th block of memory
from a shared memory area that has affinity to thread $j$ to the $j$th block
of a shared memory area that has affinity to thread $i$.
The number of bytes in each block is {\tt nbytes}.

\np{\tt nbytes} must be strictly greater than 0.

\np The {\tt upc\_all\_exchange} function treats the {\tt src} pointer
and the {\tt dst} pointer as if each
pointed to a shared memory area of {\tt nbytes}$*${\tt THREADS} bytes
on each thread and therefore had type:

\begin{verbatim}
    shared [nbytes * THREADS] char[nbytes * THREADS * THREADS]
\end{verbatim}  

\np The targets of the {\tt src} and {\tt dst} pointers
must have affinity to thread 0.

\np The {\tt src} and {\tt dst} pointers are treated as
if they have phase 0.

\np For each pair of threads $i$ and $j$, the effect is equivalent to copying
the $i$th block of {\tt nbytes} bytes that has affinity to thread $j$
pointed to by {\tt src}
to
the $j$th block of {\tt nbytes} bytes that has affinity to thread $i$ 
pointed to by {\tt dst}.

\np EXAMPLE 1 {\tt upc\_all\_exchange} for the {\em static THREADS}
translation environment.

\begin{verbatim}
  #include <upc_collective.h>
  #define NELEMS 10
  shared [NELEMS*THREADS] int A[THREADS][NELEMS*THREADS];
  shared [NELEMS*THREADS] int B[THREADS][NELEMS*THREADS];
  // Initialize A.
  upc_barrier;
  upc_all_exchange( B, A, NELEMS*sizeof(int),
                    UPC_IN_NOSYNC | UPC_OUT_NOSYNC );
  upc_barrier;
\end{verbatim}

\np EXAMPLE 2 {\tt upc\_all\_exchange} for the {\em dynamic THREADS}
translation environment.

\begin{verbatim}
  #include <upc.h>
  #include <upc_collective.h>
  #define NELEMS 10
  shared int *Adata, *Bdata;
  shared [] int *myA, *myB;
  int i;

  Adata = upc_all_alloc(THREADS*THREADS, NELEMS*sizeof(int));
  myA = (shared [] int *)&Adata[MYTHREAD];
  Bdata = upc_all_alloc(THREADS*THREADS, NELEMS*sizeof(int));
  myB = (shared [] int *)&Bdata[MYTHREAD];

  // Adata and Bdata contain THREADS*THREADS*NELEMS elements.
  // myA and myB are MYTHREAD's rows of Adata and Bdata, resp.

  // Initialize MYTHREAD's row of A.  For example,
  for (i=0; i<NELEMS*THREADS; i++)
      myA[i] = MYTHREAD*10 + i;

  upc_all_exchange( Bdata, Adata, NELEMS*sizeof(int),
                    UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC );
\end{verbatim}

\paragraph{The {\tt upc\_all\_permute} function}
\index{upc\_all\_permute}
\index{permute}

{\bf Synopsis} 

\npf\vspace{-2.5em}
\begin{verbatim}
    #include <upc_collective.h>
    void upc_all_permute(shared void * restrict dst,
        shared const void * restrict src, 
        shared const int * restrict perm,
        size_t nbytes, upc_flag_t flags);
\end{verbatim}

{\bf Description} 

\np The {\tt upc\_all\_permute} function copies a block of memory from a
shared memory area that has affinity to the $i$th thread to a block of a
shared memory that has affinity to thread {\tt perm[i]}.
The number of bytes in each block is {\tt nbytes}.

\np{\tt nbytes} must be strictly greater than 0.

\np{\tt perm[0..THREADS-1]} must contain {\tt THREADS} distinct
values: {\tt 0, 1, ...,  THREADS-1}.

\np The {\tt upc\_all\_permute} function treats the {\tt src} pointer
and the {\tt dst} pointer as if each pointed to a shared memory
area of {\tt nbytes} bytes on each thread and therefore had type:

\begin{verbatim}
    shared [nbytes] char[nbytes * THREADS]
\end{verbatim}  

\np The targets of the {\tt src}, {\tt perm}, and
{\tt dst} pointers must have affinity to thread 0.

\np The {\tt src} and {\tt dst} pointers are treated as
if they have phase 0.

\np For each thread $i$, the effect is equivalent to copying the block
of {\tt nbytes} bytes that has affinity to thread $i$ pointed to by {\tt src}
to the block of {\tt nbytes} bytes
that has affinity to thread {\tt perm}[$i$] pointed to by {\tt dst}.

\np EXAMPLE 1 {\tt upc\_all\_permute}.
\begin{verbatim}
  #include <upc_collective.h>
  #define NELEMS 10
  shared [NELEMS] int A[NELEMS*THREADS], B[NELEMS*THREADS];
  shared int P[THREADS];
  // Initialize A and P.
  upc_barrier;
  upc_all_permute( B, A, P, sizeof(int)*NELEMS,
                   UPC_IN_NOSYNC | UPC_OUT_NOSYNC );
  upc_barrier;
\end{verbatim}

\subsubsection{Computational Operations}
\label{upc-op-t-section}
\index{UPC\_FUNC}
\index{UPC\_NONCOMM\_FUNC}
\xchangenote[id=DB]{10}{DEFINITION OF UPC\_OP\_T MOVED TO CORE LIBRARY}

\npf Computational operations are specified using a value of type
{\tt upc\_op\_t}, which is specified in \upcopsection. All of the 
operations defined in that section are supported for computational collectives.

In addition, the following {\tt upc\_op\_t} value macros 
are defined in {\tt <upc\_collective.h>}:
\begin{description}
\item[{\tt UPC\_FUNC}]
Use the specified commutative function {\tt func} to operate
on the data in the {\tt src} array at each step.
\item[{\tt UPC\_NONCOMM\_FUNC}]
Use the specified non-commutative function {\tt func} to
operate on the data in the {\tt src} array at each step.
\end{description}

\np Bitwise operations shall not be specified for collective 
    operations on floating-point types.

\np The operations represented by a variable of type {\tt upc\_op\_t}
(including user-provided operators) are assumed to be associative.
A reduction or a prefix reduction whose result is dependent on the
order of operator evaluation will have undefined results.\footnote{
Implementations are not obligated to prevent failures that
might arise because of a lack of associativity of built-in functions
due to floating-point roundoff or overflow.}

\np The operations represented by a variable of type {\tt upc\_op\_t}
(except those provided using {\tt UPC\_NONCOMM\_FUNC}) are assumed
to be commutative.  A reduction or a prefix reduction (using operators
other than {\tt UPC\_NONCOMM\_FUNC}) whose result is dependent on
the order of the operands will have undefined results.

{\bf Forward references:} reduction, prefix reduction (\ref{reduction}).

\paragraph{The {\tt upc\_all\_reduce} and {\tt upc\_all\_prefix\_reduce} functions}
\label{reduction}

{\bf Synopsis} 
\index{upc\_all\_reduce}
\index{upc\_all\_prefix\_reduce}
\index{reduction}
\index{prefix reduction}

\npf 
\begin{verbatim}
#include <upc_collective.h>
void upc_all_reduce<<T>>(
        shared void * restrict dst,
        shared const void * restrict src,
        upc_op_t op,
        size_t nelems,
        size_t blk_size,
        <<TYPE>>(*func)(<<TYPE>>, <<TYPE>>),
        upc_flag_t flags);
void upc_all_prefix_reduce<<T>>(
        shared void * restrict dst,
        shared const void * restrict src,
        upc_op_t op,
        size_t nelems,
        size_t blk_size,
        <<TYPE>>(*func)(<<TYPE>>, <<TYPE>>),
        upc_flag_t flags);
\end{verbatim}

{\bf Description} 

\np The function prototypes above represent the 22 variations of the
  {\tt upc\_all\_reduce{\em T}} and {\tt upc\_all\_prefix\_reduce{\em T}} 
  functions where {\tt{\em T}} and {\tt{\em TYPE}} have the following 
correspondences:\footnote{For example, if {\tt{\em T}} is {\tt C}, then 
{\tt{\em TYPE}} must be {\tt signed char}.}
\begin{center}
\begin{tabular}{ll|ll}
{\tt{\em T}} & {\tt{\em TYPE}} \hspace*{1.5in} &
{\tt{\em T}} & {\tt{\em TYPE}} \\ \hline
{\tt C} & {\tt signed char} &
{\tt L} & {\tt signed long} \\
{\tt UC} & {\tt unsigned char} &
{\tt UL} & {\tt unsigned long} \\
{\tt S} & {\tt signed short} &
{\tt F} & {\tt float} \\
{\tt US} & {\tt unsigned short} &
{\tt D} & {\tt double} \\
{\tt I} & {\tt signed int} &
{\tt LD} & {\tt long double} \\
{\tt UI} & {\tt unsigned int} &
\end{tabular}
\end{center}

\np On completion of the {\tt upc\_all\_reduce} variants, 
the value of the {\tt{\em TYPE}} shared object
referenced by {\tt dst} is
{\tt src[0]} $\oplus$ {\tt src[1]} $\oplus \cdots \oplus$
{\tt src[nelems-1]}
where ``$\oplus$'' is the operator specified by the variable {\tt op}.

\np On completion of the {\tt upc\_all\_prefix\_reduce} variants, 
the value of the {\tt{\em TYPE}} shared object
referenced by {\tt dst[i]} is
{\tt src[0]} $\oplus$ {\tt src[1]}
$\oplus \cdots \oplus$ {\tt src[i]}
for $0 \leq$ {\tt i} $\leq$ {\tt nelems-1} and
where ``$\oplus$'' is the operator specified by the variable {\tt op}.

\np
If the value of {\tt blk\_size} passed to these functions is
greater than 0 then they treat the {\tt src} pointer
as if it pointed to a shared memory area of {\tt nelems} elements of
type {\tt{\em TYPE}} and blocking factor {\tt blk\_size}, and therefore
had type:

\begin{verbatim}
    shared [blk_size] TYPE [nelems]
\end{verbatim}

\np
If the value of {\tt blk\_size} passed to these functions is
0 then they treat the {\tt src} pointer
as if it pointed to a shared memory area of {\tt nelems} elements of
type {\tt{\em TYPE}} with an indefinite layout qualifier, and
therefore had
type\footnote{Note that {\tt upc\_blocksize(src) == 0} if
{\tt src} has this type, so the argument value 0 has a natural
connection to the block size of {\tt src}.}:

\begin{verbatim}
    shared []  TYPE[nelems]
\end{verbatim}

\np The phase of the {\tt src} pointer is respected when
referencing array elements, as specified above.

\np{\tt upc\_all\_prefix\_reduce{\em T}} treats the {\tt dst} pointer
    equivalently to the {\tt src} pointer as described in the preceding
    three paragraphs.
    
\np{\tt upc\_all\_prefix\_reduce{\em T}} requires the affinity and
phase of the {\tt src} and {\tt dst} pointers to match, i.e.,
{\tt upc\_threadof(src) == upc\_threadof(dst) \&\& upc\_phaseof(src) == upc\_phaseof(dst)}.

\np{\tt upc\_all\_reduce{\em T}} treats the {\tt dst} pointer as having type:

\begin{verbatim}
    shared TYPE *
\end{verbatim}

\np EXAMPLE 1 {\tt upc\_all\_reduce} of type {\tt long UPC\_ADD}.
\begin{verbatim}
  #include <upc_collective.h>
  #define BLK_SIZE 3
  #define NELEMS 10
  shared [BLK_SIZE] long A[NELEMS*THREADS];
  shared long *B;
  long result;
  // Initialize A.  The result below is defined only on thread 0.
  upc_barrier;
  upc_all_reduceL( B, A, UPC_ADD, NELEMS*THREADS, BLK_SIZE,
                   NULL, UPC_IN_NOSYNC | UPC_OUT_NOSYNC );
  upc_barrier;
\end{verbatim}

%\paragraph{The {\tt upc\_all\_prefix\_reduce} function}
%\label{prefix-reduction}
%
% {\bf Synopsis} 
%
%1\hspace{1em}   \\
%        {\tt \#include <upc\_collective.h>}\\
%        {\tt void {upc\_all\_prefix\_reduce}{\em T}(restrict shared void *dst, \\
%         \phantom{void {upc\_all\_prefix\_reduce}{\em T}(}restrict shared const void *src,\\
%\phantom{void {upc\_all\_prefix\_reduce}{\em T}(}upc\_op\_t op, size\_t nelems, size\_t blk\_size,\\
%\phantom{void {upc\_all\_prefix\_reduce}{\em T}(}{\em TYPE} (*func)({\em TYPE}, {\em TYPE}), \\
%\phantom{void {upc\_all\_prefix\_reduce}{\em T}(}upc\_flag\_t flags);} \\
%
% {\bf Description} 
%
%1\hspace{1em} The function prototype above represents the 11 variations of the
%{\tt upc\_all\_reduce{\em T}} function where {\tt{\em T}} and {\tt{\em TYPE}} have the
%following correspondences:
%
%\begin{center}
%\begin{tabular}{ll|ll}
%{\tt{\em T}} & {\tt{\em TYPE}} \hspace*{1.5in} &
%{\tt{\em T}} & {\tt{\em TYPE}} \\ \hline
%{\tt C} & {\tt signed char} &
%{\tt L} & {\tt signed long} \\
%{\tt UC} & {\tt unsigned char} &
%{\tt UL} & {\tt unsigned long} \\
%{\tt S} & {\tt signed short} &
%{\tt F} & {\tt float} \\
%{\tt US} & {\tt unsigned short} &
%{\tt D} & {\tt double} \\
%{\tt I} & {\tt signed int} &
%{\tt LD} & {\tt long double} \\
%{\tt UI} & {\tt unsigned int} &
%\end{tabular}
%\end{center}
%
%2\hspace{1em} For example, if {\tt{\em T}} is {\tt C}, then {\tt{\em TYPE}} must be {\tt signed char}.
%
%3\hspace{1em}
%\label{prefix-reduce-blksize-gt-0}
%If the value of {\tt blk\_size} passed to {\tt upc\_all\_prefix\_reduce{\em T}} is
%greater than 0 then
%{\tt upc\_all\_prefix\_reduce{\em T}} treats the {\tt src} pointer and the
%{\tt dst} pointer
%as if each pointed to a shared memory area of {\tt nelems} elements of
%type {\tt{\em TYPE}} and blocking factor {\tt blk\_size}, and therefore
%had type:
%
%\begin{verbatim}
%    shared [blk_size}] TYPE[nelems]
%\end{verbatim}
%
%4\hspace{1em}
%\label{prefix-reduce-blksize-0}
%If the value of {\tt blk\_size} passed to {\tt upc\_all\_prefix\_reduce{\em T}} is
%0 then \\
%{\tt upc\_all\_prefix\_reduce{\em T}} treats the {\tt src} pointer and the
%{\tt dst} pointer
%as if each pointed to a shared memory area of {\tt nelems} elements of
%type {\tt{\em TYPE}} with an indefinite layout qualifier, and
%therefore had
%type\footnote{Note that {\tt upc\_blocksize(src) == 0} if
%{\tt src} has this type, so the argument value 0 has a natural
%connection to the block size of {\tt src}.}:
%
%\begin{verbatim}
%    shared [] TYPE [nelems]
%\end{verbatim}
%
%% 1\hspace{1em} The targets of the {\tt src} and {\tt dst} pointers
%% must have affinity to thread 0.
%
%5\hspace{1em} The phases of the {\tt src} and {\tt dst} pointers are
%respected when referencing array elements, as specified in items
%\ref{prefix-reduce-blksize-gt-0} and \ref{prefix-reduce-blksize-0} above.
%
%6\hspace{1em} At function exit
%{\tt dst[i]} = {\tt src[0]} $\oplus$ {\tt src[1]}
%$\oplus \cdots \oplus$ {\tt src[i]}
%for $0 \leq$ {\tt i} $\leq$ {\tt nelems-1} and
%where ``$\oplus$'' is the operator specified by the variable {\tt op}.
%
\np EXAMPLE 2 {\tt upc\_all\_prefix\_reduce} of type {\tt long UPC\_ADD}.

\begin{verbatim}
  #include <upc_collective.h>
  #define BLK_SIZE 3
  #define NELEMS 10
  shared [BLK_SIZE] long A[NELEMS*THREADS];
  shared [BLK_SIZE] long B[NELEMS*THREADS];
  // Initialize A.
  upc_all_prefix_reduceL( B, A, UPC_ADD, NELEMS*THREADS, BLK_SIZE,
                          NULL, UPC_IN_ALLSYNC | UPC_OUT_ALLSYNC );
\end{verbatim}

